import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error, r2_score
import json
import plotly.express as px
from utils.reductors import *
from utils.utils import plot_dim_reduction, summarize_dataset, RF_score, RF_importance, prepare_dataset
from collections import namedtuple
from sklearn.model_selection import KFold, cross_val_score
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
from sklearn.ensemble import RandomForestRegressor
import importlib
%load_ext autoreload
%autoreload 2
Clean and Visualize data¶
Check that there are no NaNs, unexpected zeros, or large outliers.
df = pd.read_csv('data/data_1.csv')
summarize_dataset(df)
| | Feature | Unique Count | NaN Count | Nulls Count | Type | Min | Max | Most Frequent | Least Frequent |
|---|---|---|---|---|---|---|---|---|---|
| 0 | snippet_id | 70 | 0 | (0,) | int64 | 11.000000 | 202.000000 | 11.000000 | None |
| 1 | score | 28 | 0 | (0,) | float64 | 0.400000 | 1.000000 | 0.700000 | None |
| 2 | mark | 17 | 0 | (12,) | float64 | 0.000000 | 1.000000 | 0.500000 | None |
| 3 | lines_min | 23 | 0 | (0,) | int64 | 1.000000 | 99.000000 | 3.000000 | None |
| 4 | lines_max | 35 | 0 | (0,) | int64 | 4.000000 | 99.000000 | 9.000000 | None |
| 5 | lines_mean | 58 | 0 | (0,) | float64 | 3.500000 | 99.000000 | 8.000000 | None |
| 6 | lines_count | 11 | 0 | (0,) | int64 | 1.000000 | 17.000000 | 2.000000 | None |
| 7 | maxNesting_min | 6 | 0 | (46,) | int64 | 0.000000 | 11.000000 | 0.000000 | None |
| 8 | maxNesting_max | 7 | 0 | (7,) | int64 | 0.000000 | 11.000000 | 2.000000 | None |
| 9 | maxNesting_mean | 24 | 0 | (7,) | float64 | 0.000000 | 11.000000 | 1.000000 | None |
| 10 | maxAstDistance_min | 11 | 0 | (11,) | int64 | 0.000000 | 19.000000 | 2.000000 | None |
| 11 | maxAstDistance_max | 16 | 0 | (0,) | int64 | 2.000000 | 19.000000 | 3.000000 | None |
| 12 | maxAstDistance_mean | 40 | 0 | (0,) | float64 | 0.666667 | 19.000000 | 2.000000 | None |
| 13 | halstead_min | 62 | 0 | (0,) | float64 | 15.509775 | 4433.160733 | 15.509775 | None |
| 14 | halstead_max | 69 | 0 | (0,) | float64 | 77.661794 | 4433.160733 | 252.615012 | None |
| 15 | halstead_mean | 70 | 0 | (0,) | float64 | 42.365537 | 4433.160733 | 42.365537 | None |
| 16 | entropy_min | 64 | 0 | (0,) | float64 | 2.584963 | 15.709933 | 2.584963 | None |
| 17 | entropy_max | 69 | 0 | (0,) | float64 | 4.288642 | 15.709933 | 6.571583 | None |
| 18 | entropy_mean | 70 | 0 | (0,) | float64 | 3.450244 | 15.709933 | 3.450244 | None |
| 19 | readability_score | 70 | 0 | (0,) | float64 | 0.000802 | 1.000000 | 0.000802 | None |
train, test = train_test_split(df, test_size=0.07, random_state=42)
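# Manually move a fixed set of snippet_ids from train into the held-out test set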
rows_to_append = train[train['snippet_id'].isin([58, 12, 155, 18, 168])]
test = pd.concat([test, rows_to_append])
train = train[~train['snippet_id'].isin([58, 12, 155, 18, 168])]
We set the test set aside at this stage to prevent data leakage during feature selection and feature engineering. While selecting the model we will evaluate with cross-validation, since we only have 63 samples (train+val) plus 7 (test).
train.hist(bins=10, figsize=(20,15))
plt.show()
Several features have tails to the right.
Comparing score and mark, mark is distributed more centrally and symmetrically (we would like to believe the dataset is centered). What is concerning is that mark has only 17 unique values versus 28 for score (this may get fixed once more respondent ratings are collected).
params = RF_score(train, [])
Trial with highest accuracy:
number: 22
params: {'n_estimators': 12, 'max_depth': 4, 'min_samples_split': 3, 'min_samples_leaf': 2}
-rmse, -mae, r2: [-0.10828408761504897, -0.07979657685944072, -0.09636127228963243]
RF_importance(train, [], params)
['maxAstDistance_max', 'maxNesting_mean', 'halstead_min', 'entropy_min', 'maxAstDistance_mean', 'halstead_mean', 'lines_max', 'lines_mean', 'halstead_max', 'maxAstDistance_min', 'maxNesting_max', 'maxNesting_min', 'lines_count', 'lines_min', 'entropy_max', 'entropy_mean']
Quite interpretable. Still, it is not good that lines and other raw features can produce outliers and very large values. Let's try normalizing them first, and if that does not work, look for more features.
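RF_score and RF_importance are helpers from utils.utils that are not shown in this notebook. Judging by the logs above, RF_score runs an Optuna-style hyperparameter search over a RandomForestRegressor scored with cross-validation, and RF_importance returns the features sorted by the fitted forest's importances. A minimal sketch of RF_score under those assumptions (the real search space and signatures may differ):
import optuna

def rf_score_sketch(df, drop_cols, n_trials=40):
    # Hypothetical reimplementation of RF_score: tune a RandomForestRegressor
    # with cross-validated negative RMSE and return the best parameters.
    X = df.drop(columns=['score', 'mark', 'readability_score', 'snippet_id'] + drop_cols)
    y = df['score']
    cv = KFold(n_splits=5, shuffle=True, random_state=42)

    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 2, 20),
            'max_depth': trial.suggest_int('max_depth', 2, 6),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        }
        model = RandomForestRegressor(**params, random_state=42)
        return cross_val_score(model, X, y, cv=cv, scoring='neg_root_mean_squared_error').mean()

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)
    return study.best_params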
Feature Engineering¶
log1p and x/(x+1) are suitable for this: log1p compresses the heavy right tails, while x/(x+1) maps any non-negative value into [0, 1).
columns_to_transform = [
'lines_min', 'lines_max', 'lines_mean', 'lines_count',
'maxNesting_min', 'maxNesting_max', 'maxNesting_mean',
'maxAstDistance_min', 'maxAstDistance_max', 'maxAstDistance_mean',
'halstead_min', 'halstead_max', 'halstead_mean', 'entropy_min',
'entropy_max', 'entropy_mean'
]
log_train = train[['score', 'mark', 'readability_score', 'snippet_id']].copy()
transf_train = train[['score', 'mark', 'readability_score', 'snippet_id']].copy()
for col in columns_to_transform:
    log_train[f'log_{col}'] = np.log1p(train[col].values)
    transf_train[f'transf_{col}'] = train[col].values / (train[col].values + 1)
log_train.hist(bins=10, figsize=(20,15))
plt.show()
params = RF_score(log_train, [])
log_params = RF_importance(log_train, [], params)
Trial with highest accuracy:
number: 22
params: {'n_estimators': 18, 'max_depth': 5, 'min_samples_split': 4, 'min_samples_leaf': 2}
-rmse, -mae, r2: [-0.10918378521713272, -0.07988756500844964, -0.15071227455966268]
transf_train.hist(bins=10, figsize=(20,15))
plt.show()
params = RF_score(transf_train, [])
transf_params = RF_importance(transf_train, [], params)
Trial with highest accuracy:
number: 22
params: {'n_estimators': 18, 'max_depth': 5, 'min_samples_split': 4, 'min_samples_leaf': 2}
-rmse, -mae, r2: [-0.1091760346107038, -0.07988756500844964, -0.1499807379878978]
The log-transformed features look better. Still, x/(x+1) has the advantage that it cannot produce outliers, since all its values lie between 0 and 1.
Context¶
One idea that comes to mind is adding some kind of per-line density. We have halsteadVolume = tokens.values.sum() * log2(tokens.size.toDouble()) and $\mathrm{entropy} = -\sum_{i=1}^{n} P(x_i) \log_2 P(x_i)$, where $P(x_i)$ is the probability of token $x_i$. Let this be a temporary measure until we have the number of operands per line or something similar. We will try halsteadVolume * entropy / lines and (halsteadVolume + entropy) / lines for each method, and also take their min, max, and mean.
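The data/data_1_feat.csv file loaded below presumably contains these densities precomputed per snippet. A sketch of how the sum_feat/mult_feat aggregates could be derived from per-method metrics (the per-method table and its column names are assumptions, not the real data format):
# Hypothetical per-method table: one row per method (names are assumptions).
methods = pd.DataFrame({
    'snippet_id': [11, 11, 12],
    'halstead':   [120.5, 300.2, 95.1],
    'entropy':    [4.1, 6.3, 3.8],
    'lines':      [10, 25, 7],
})
methods['mult_feat'] = methods['halstead'] * methods['entropy'] / methods['lines']
methods['sum_feat'] = (methods['halstead'] + methods['entropy']) / methods['lines']
# Aggregate each density over a snippet's methods: min, max, mean.
feat_sketch = methods.groupby('snippet_id')[['sum_feat', 'mult_feat']].agg(['min', 'max', 'mean'])
feat_sketch.columns = [f'{col}_{stat}' for col, stat in feat_sketch.columns]
feat_sketch = feat_sketch.reset_index()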
feat = pd.read_csv('data/data_1_feat.csv')
log_train_feat = pd.merge(log_train, feat, on='snippet_id')
log_train_feat.hist(bins=10, figsize=(20,15))
plt.show()
params = RF_score(log_train_feat, [])
log_favorite_features = RF_importance(log_train_feat, [], params)
Trial with highest accuracy:
number: 17
params: {'n_estimators': 7, 'max_depth': 5, 'min_samples_split': 5, 'min_samples_leaf': 3}
-rmse, -mae, r2: [-0.11374051679479683, -0.07669385420145354, -0.24276329995734777]
transf_train_feat = pd.merge(transf_train, feat, on='snippet_id')
transf_train_feat.hist(bins=10, figsize=(20,15))
plt.show()
params = RF_score(transf_train_feat, [])
transf_features_to_emb = RF_importance(transf_train_feat, [], params)
Trial with highest accuracy:
number: 17
params: {'n_estimators': 7, 'max_depth': 5, 'min_samples_split': 5, 'min_samples_leaf': 3}
-rmse, -mae, r2: [-0.11374051679479683, -0.07669385420145354, -0.24276329995734777]
This has become somewhat easier to interpret (we will have to write explanations and recommendations for users), but the error grew slightly. Let's try limiting the number of features.
fig = px.scatter_3d(transf_train_feat, x="mult_feat_max", y="transf_entropy_max", z="transf_maxNesting_max", color='score')
_ = fig.update_traces(marker_size=2.5)
fig.show()
fig = px.scatter_3d(log_train_feat, x="log_lines_mean", y="log_maxNesting_mean", z="mult_feat_min", color='score')
_ = fig.update_traces(marker_size=2.5)
fig.show()
Splitting the data into clusters by the raw features did not work out. The most we got across different combinations is that in some projections readability increases, on average, toward the center. In the future we could add the distance to the center as a feature. Let's try different dimensionality reducers on the new features.
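A minimal sketch of the distance-to-center feature mentioned above (not used further in this notebook; standardizing the features first is my assumption):
from sklearn.preprocessing import StandardScaler

def add_center_distance(df, feature_cols, out_col='center_dist'):
    # Distance from each row to the centroid of the standardized feature space.
    scaled = StandardScaler().fit_transform(df[feature_cols])
    dist = np.linalg.norm(scaled - scaled.mean(axis=0), axis=1)
    out = df.copy()
    out[out_col] = dist
    return out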
Reduce dimensions¶
log_favorite_features = log_favorite_features[0:9]
transf_features_to_emb = transf_features_to_emb[0:9]
from collections import namedtuple
hue_info = namedtuple('hue_info', ['field_name', 'is_categorical'])
def log_plot_standard_embeddings(plot_data, features, return_results=False):
    mapper_dict = {
        'tsne 3D perplexity 2 exaggeration 6': {
            'params': {
                'perplexity': 2,
                'verbose': False,
                'n_components': 3,
                'exaggeration': 6,
                'dof': 1,
            },
            'func': make_tsne,
        }
    }
    res = plot_dim_reduction(
        data=plot_data,
        mapper_dict=mapper_dict,
        default_features=features,
        default_hue_info=hue_info(field_name='score', is_categorical=False),
        row_height=450,
        return_results=return_results
    )
    return res
log_res = log_plot_standard_embeddings(log_train_feat, log_favorite_features, return_results=True)
def transf_plot_standard_embeddings(plot_data, features, return_results=False):
    mapper_dict = {
        'tsne 3D perplexity 2 exaggeration 5': {
            'params': {
                'perplexity': 2,
                'verbose': False,
                'n_components': 3,
                'exaggeration': 5,
                'dof': 1,
            },
            'func': make_tsne,
        },
    }
    res = plot_dim_reduction(
        data=plot_data,
        mapper_dict=mapper_dict,
        default_features=features,
        default_hue_info=hue_info(field_name='score', is_categorical=False),
        row_height=450,
        return_results=return_results
    )
    return res
transf_res = transf_plot_standard_embeddings(transf_train_feat, transf_features_to_emb, return_results=True)
We managed to get embeddings where, almost everywhere, elements of the same cluster differ in score by no more than 0.2-0.24 (taking the cluster mean, the error would still be smaller than the old model's).
log_train_feat['tsne-x'] = log_res['tsne 3D perplexity 2 exaggeration 6']['embedding'][:, 0]
log_train_feat['tsne-y'] = log_res['tsne 3D perplexity 2 exaggeration 6']['embedding'][:, 1]
log_train_feat['tsne-z'] = log_res['tsne 3D perplexity 2 exaggeration 6']['embedding'][:, 2]
transf_train_feat['tsne-x'] = transf_res['tsne 3D perplexity 2 exaggeration 5']['embedding'][:, 0]
transf_train_feat['tsne-y'] = transf_res['tsne 3D perplexity 2 exaggeration 5']['embedding'][:, 1]
transf_train_feat['tsne-z'] = transf_res['tsne 3D perplexity 2 exaggeration 5']['embedding'][:, 2]
Correlation matrix
fig = plt.figure(figsize=(14, 8))
train_corr = log_train_feat.drop(['snippet_id'], axis=1).corr()
sns.heatmap(train_corr, cmap="coolwarm", annot=True, fmt=".2f")
plt.title("Correlation matrix")
plt.show()
The correlation between score and mark is not that high. And neither of them is correlated with the previous model's readability_score.
Select the interesting columns: ['log_halstead_max', 'log_lines_max', 'log_maxNesting_min', 'log_maxNesting_max', 'log_maxAstDistance_max', 'log_entropy_max', 'mult_feat_max', 'tsne-y']
log_train_feat_dropped = log_train_feat[['log_halstead_max', 'log_lines_max','log_maxNesting_max', 'log_maxAstDistance_max', 'log_entropy_max', 'tsne-y', 'score', 'mark', 'readability_score', 'snippet_id']]
log_params = RF_score(log_train_feat_dropped, [])
RF_importance(log_train_feat_dropped, [], log_params)
Trial with highest accuracy:
number: 22
params: {'n_estimators': 10, 'max_depth': 2, 'min_samples_split': 7, 'min_samples_leaf': 2}
-rmse, -mae, r2: [-0.10717455189827318, -0.07350621737696028, -0.07916062383541451]
['tsne-y', 'log_maxAstDistance_max', 'log_entropy_max', 'log_lines_max', 'log_maxNesting_max', 'log_halstead_max']
fig = plt.figure(figsize=(14, 8))
train_corr = transf_train_feat.corr()
sns.heatmap(train_corr, cmap="coolwarm", annot=True, fmt=".2f")
plt.title("Correlation matrix")
plt.show()
transf_params = RF_score(transf_train_feat, [])
RF_importance(transf_train_feat, [], transf_params)
Trial with highest accuracy:
number: 34
params: {'n_estimators': 20, 'max_depth': 4, 'min_samples_split': 3, 'min_samples_leaf': 6}
-rmse, -mae, r2: [-0.11056188976705332, -0.0854946182459751, -0.06221973012468429]
['transf_maxAstDistance_max', 'tsne-y', 'mult_feat_max', 'sum_feat_mean', 'transf_entropy_mean', 'sum_feat_min', 'transf_halstead_mean', 'transf_lines_mean', 'transf_maxNesting_mean', 'transf_entropy_max', 'tsne-z', 'transf_halstead_max', 'sum_feat_max', 'tsne-x', 'transf_lines_max', 'transf_lines_min', 'mult_feat_mean', 'transf_lines_count', 'transf_maxNesting_min', 'transf_entropy_min', 'mult_feat_min', 'transf_maxAstDistance_mean', 'transf_halstead_min', 'transf_maxAstDistance_min', 'transf_maxNesting_max']
Select a subset of columns that does not hurt the score:
columns_to_train = ['transf_maxAstDistance_max',
'transf_maxNesting_min',
'tsne-x',
'sum_feat_min',
'tsne-y',
'transf_maxAstDistance_mean',
'transf_maxNesting_max',
'transf_lines_mean',
'sum_feat_max',
'sum_feat_mean',
'transf_maxNesting_mean', 'score', 'mark', 'readability_score', 'snippet_id']
transf_train_feat_dropped = transf_train_feat[columns_to_train]
transf_params = RF_score(transf_train_feat_dropped, [])
RF_importance(transf_train_feat_dropped, [], transf_params)
Trial with highest accuracy:
number: 2
params: {'n_estimators': 4, 'max_depth': 2, 'min_samples_split': 8, 'min_samples_leaf': 10}
-rmse, -mae, r2: [-0.12388285956101149, -0.08744536315864608, -0.3867650421596122]
['tsne-x', 'transf_maxAstDistance_max', 'sum_feat_min', 'sum_feat_mean', 'tsne-y', 'transf_maxNesting_mean', 'transf_lines_mean', 'sum_feat_max', 'transf_maxNesting_max', 'transf_maxAstDistance_mean', 'transf_maxNesting_min']
columns_to_train = [item for item in columns_to_train if item not in ['mark', 'readability_score', 'snippet_id']]
Prepare test dataset¶
feat_test = pd.merge(test, feat, on='snippet_id')
X_test, y_test = prepare_dataset(feat_test, columns_to_transform, transf_features_to_emb, transf_plot_standard_embeddings, columns_to_train)
Check that there are no surprises.
X_test.hist(bins=10, figsize=(20,15))
plt.show()
y_test.hist(bins=10, figsize=(20,15))
plt.show()
Random forest models¶
X_train = transf_train_feat_dropped.drop(["mark", "score", 'readability_score', 'snippet_id'], axis=1)
y_train = transf_train_feat_dropped["score"]
RF = RandomForestRegressor(**transf_params, random_state=42)
RF.fit(X_train, y_train)
y = RF.predict(X_test)
print('Root Mean Squared Error:', root_mean_squared_error(y_test, y))
print('Mean Absolute Error:', mean_absolute_error(y_test, y))
print('R2 Score:', r2_score(y_test, y))
RandomForestRegressor(max_depth=2, min_samples_leaf=10, min_samples_split=8,
                      n_estimators=4, random_state=42)
Root Mean Squared Error: 0.13226372744025008
Mean Absolute Error: 0.11712542588651287
R2 Score: -10.083828101212726
Neural networks models¶
class DeepLearningModel(nn.Module):
    def __init__(self):
        super(DeepLearningModel, self).__init__()
        self.hidden1 = nn.Linear(X_train.shape[1], 128)
        self.hidden2 = nn.Linear(128, 128)
        self.hidden3 = nn.Linear(128, 128)
        self.output = nn.Linear(128, 1)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        x = torch.relu(self.hidden1(x))
        x = self.dropout(x)
        x = torch.relu(self.hidden2(x))
        x = self.dropout(x)
        x = torch.relu(self.hidden3(x))
        x = self.dropout(x)
        x = self.output(x)
        return x
k_folds = 5
device = 'cpu'
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
train_losses_per_fold = []
val_losses_per_fold = []
fold_train_loss_history = []
fold_val_loss_history = []
fold_rmse_scores = []
fold_mae_scores = []
fold_r2_scores = []
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f'FOLD {fold+1}/{k_folds}')
    X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]
    X_train_tensor = torch.tensor(X_train_fold.to_numpy(), dtype=torch.float32).to(device)
    y_train_tensor = torch.tensor(y_train_fold.to_numpy(), dtype=torch.float32).view(-1, 1).to(device)
    X_val_tensor = torch.tensor(X_val_fold.to_numpy(), dtype=torch.float32).to(device)
    y_val_tensor = torch.tensor(y_val_fold.to_numpy(), dtype=torch.float32).view(-1, 1).to(device)
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    model = DeepLearningModel().to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.005, weight_decay=1e-4)
    train_loss_history = []
    val_loss_history = []
    num_epochs = 100
    for epoch in range(num_epochs):
        _ = model.train()
        running_loss = 0.0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = torch.sqrt(criterion(outputs, y_batch))
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        avg_train_loss = running_loss / len(train_loader)
        train_loss_history.append(avg_train_loss)
        _ = model.eval()
        with torch.no_grad():
            val_predictions = model(X_val_tensor)
            val_loss = torch.sqrt(criterion(val_predictions, y_val_tensor)).item()
        val_loss_history.append(val_loss)
        print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}')
    train_losses_per_fold.append(avg_train_loss)
    val_losses_per_fold.append(val_loss)
    val_predictions_np = val_predictions.cpu().numpy()
    y_val_np = y_val_tensor.cpu().numpy()
    rmse = np.sqrt(mean_squared_error(y_val_np, val_predictions_np))
    mae = mean_absolute_error(y_val_np, val_predictions_np)
    r2 = r2_score(y_val_np, val_predictions_np)
    fold_rmse_scores.append(rmse)
    fold_mae_scores.append(mae)
    fold_r2_scores.append(r2)
    print(f'Fold {fold+1} - RMSE: {rmse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}')
    fold_train_loss_history.append(train_loss_history)
    fold_val_loss_history.append(val_loss_history)
avg_train_loss_across_folds = np.mean(train_losses_per_fold)
avg_val_loss_across_folds = np.mean(val_losses_per_fold)
avg_rmse_across_folds = np.mean(fold_rmse_scores)
avg_mae_across_folds = np.mean(fold_mae_scores)
avg_r2_across_folds = np.mean(fold_r2_scores)
print(f'\nAverage Train Loss Across Folds: {avg_train_loss_across_folds:.4f}')
print(f'Average Validation Loss Across Folds: {avg_val_loss_across_folds:.4f}')
print(f'Average RMSE Across Folds: {avg_rmse_across_folds:.4f}')
print(f'Average MAE Across Folds: {avg_mae_across_folds:.4f}')
print(f'Average R2 Across Folds: {avg_r2_across_folds:.4f}')
[per-epoch training logs trimmed; final per-fold results:]
Fold 1 - RMSE: 0.1111, MAE: 0.0884, R²: 0.3929
Fold 2 - RMSE: 0.1182, MAE: 0.1074, R²: -0.2511
Fold 3 - RMSE: 0.1536, MAE: 0.1266, R²: -0.2685
Fold 4 - RMSE: 0.1994, MAE: 0.1430, R²: -1.6530
Fold 5 - RMSE: 0.1213, MAE: 0.1010, R²: -1.7029
Average Train Loss Across Folds: 0.1146
Average Validation Loss Across Folds: 0.1407
Average RMSE Across Folds: 0.1407
Average MAE Across Folds: 0.1133
Average R2 Across Folds: -0.6965
X_test_tensor = torch.tensor(X_test.to_numpy(), dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.float32).view(-1, 1).to(device)
_ = model.eval()
with torch.no_grad():
    test_predictions = model(X_test_tensor)
rmse = np.sqrt(mean_squared_error(y_test_tensor, test_predictions))
mae = mean_absolute_error(y_test_tensor, test_predictions)
r2 = r2_score(y_test_tensor, test_predictions)
print(f'RMSE: {rmse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}')
RMSE: 0.1641, MAE: 0.1373, R2: -0.2829
Comparing with the existing model¶
res_y_test = test['score']
print('Root Mean Squared Error:', root_mean_squared_error(res_y_test, test['readability_score']))
print('Mean Absolute Error:', mean_absolute_error(res_y_test, test['readability_score']))
print('R2 Score:', r2_score(res_y_test, test['readability_score']))
Root Mean Squared Error: 0.36903913635230495
Mean Absolute Error: 0.3423734234794974
R2 Score: -5.486468096763911
Test on other datasets¶
%reload_ext autoreload
scalabrino = pd.read_csv('data/Scalabrino_prepared.csv')
summarize_dataset(scalabrino)
| | Feature | Unique Count | NaN Count | Nulls Count | Type | Min | Max | Most Frequent | Least Frequent |
|---|---|---|---|---|---|---|---|---|---|
| 0 | file | 200 | 0 | (0,) | object | Snippet1 | Snippet99 | Snippet1 | Snippet99 |
| 1 | lines_min | 34 | 0 | (0,) | int64 | 9 | 42 | 12 | None |
| 2 | lines_max | 34 | 0 | (0,) | int64 | 9 | 42 | 12 | None |
| 3 | lines_mean | 34 | 0 | (0,) | float64 | 9.0 | 42.0 | 12.0 | None |
| 4 | lines_count | 1 | 0 | (0,) | int64 | 1 | 1 | 1 | None |
| 5 | maxNesting_min | 9 | 0 | (75,) | int64 | 0 | 8 | 0 | None |
| 6 | maxNesting_max | 9 | 0 | (75,) | int64 | 0 | 8 | 0 | None |
| 7 | maxNesting_mean | 9 | 0 | (75,) | float64 | 0.0 | 8.0 | 0.0 | None |
| 8 | maxAstDistance_min | 13 | 0 | (18,) | int64 | 0 | 15 | 3 | None |
| 9 | maxAstDistance_max | 13 | 0 | (18,) | int64 | 0 | 15 | 3 | None |
| 10 | maxAstDistance_mean | 13 | 0 | (18,) | float64 | 0.0 | 15.0 | 3.0 | None |
| 11 | halstead_min | 196 | 0 | (0,) | float64 | 95.183873 | 2976.685402 | 298.056005 | None |
| 12 | halstead_max | 196 | 0 | (0,) | float64 | 95.183873 | 2976.685402 | 298.056005 | None |
| 13 | halstead_mean | 196 | 0 | (0,) | float64 | 95.183873 | 2976.685402 | 298.056005 | None |
| 14 | entropy_min | 199 | 0 | (0,) | float64 | 4.493546 | 15.09684 | 7.422042 | None |
| 15 | entropy_max | 199 | 0 | (0,) | float64 | 4.493546 | 15.09684 | 7.422042 | None |
| 16 | entropy_mean | 199 | 0 | (0,) | float64 | 4.493546 | 15.09684 | 7.422042 | None |
| 17 | sum_feat_min | 199 | 0 | (0,) | float64 | 8.312731 | 106.994859 | 19.092378 | None |
| 18 | sum_feat_max | 199 | 0 | (0,) | float64 | 8.312731 | 106.994859 | 19.092378 | None |
| 19 | sum_feat_mean | 199 | 0 | (0,) | float64 | 8.312731 | 106.994859 | 19.092378 | None |
| 20 | mult_feat_min | 199 | 0 | (0,) | float64 | 36.240462 | 1031.546209 | 138.261511 | None |
| 21 | mult_feat_max | 199 | 0 | (0,) | float64 | 36.240462 | 1031.546209 | 138.261511 | None |
| 22 | mult_feat_mean | 199 | 0 | (0,) | float64 | 36.240462 | 1031.546209 | 138.261511 | None |
| 23 | score | 26 | 0 | (0,) | float64 | 0.377778 | 0.977778 | 0.777778 | None |
X_test_scalabrino, y_test_scalabrino = prepare_dataset(scalabrino, columns_to_transform, transf_features_to_emb, transf_plot_standard_embeddings, columns_to_train)
Our model:
y_scalabrino = RF.predict(X_test_scalabrino)
print('Root Mean Squared Error:', root_mean_squared_error(y_test_scalabrino, y_scalabrino))
print('Mean Absolute Error:', mean_absolute_error(y_test_scalabrino, y_scalabrino))
print('R2 Score:', r2_score(y_test_scalabrino, y_scalabrino))
Root Mean Squared Error: 0.1319638326212417
Mean Absolute Error: 0.10853167927944464
R2 Score: -0.2443679428131582
The existing model:
scalabrino_features = pd.read_csv('data/Scalabrino_readability_features.csv')
y_scalabrino = scalabrino_features['readability_score']
print('Root Mean Squared Error:', root_mean_squared_error(y_test_scalabrino, y_scalabrino))
print('Mean Absolute Error:', mean_absolute_error(y_test_scalabrino, y_scalabrino))
print('R2 Score:', r2_score(y_test_scalabrino, y_scalabrino))
Root Mean Squared Error: 0.5773720954263518
Mean Absolute Error: 0.5163915095159056
R2 Score: -22.820482533618144
Agnia's Dataset
agnia = pd.read_csv('data/Agnia_prepared.csv')
summarize_dataset(agnia)
| | Feature | Unique Count | NaN Count | Nulls Count | Type | Min | Max | Most Frequent | Least Frequent |
|---|---|---|---|---|---|---|---|---|---|
| 0 | file | 119 | 0 | (0,) | object | b10 | i9 | b9 | e45 |
| 1 | lines_min | 33 | 0 | (0,) | int64 | 3 | 48 | 7 | None |
| 2 | lines_max | 31 | 0 | (0,) | int64 | 4 | 48 | 11 | None |
| 3 | lines_mean | 45 | 0 | (0,) | float64 | 4.0 | 48.0 | 8.0 | None |
| 4 | lines_count | 5 | 0 | (0,) | int64 | 1 | 5 | 1 | None |
| 5 | maxNesting_min | 7 | 0 | (35,) | int64 | 0 | 6 | 1 | None |
| 6 | maxNesting_max | 7 | 0 | (16,) | int64 | 0 | 6 | 2 | None |
| 7 | maxNesting_mean | 17 | 0 | (16,) | float64 | 0.0 | 6.0 | 2.0 | None |
| 8 | maxAstDistance_min | 12 | 0 | (3,) | int64 | 0 | 17 | 3 | None |
| 9 | maxAstDistance_max | 11 | 0 | (3,) | int64 | 0 | 17 | 4 | None |
| 10 | maxAstDistance_mean | 24 | 0 | (3,) | float64 | 0.0 | 17.0 | 7.0 | None |
| 11 | halstead_min | 103 | 0 | (0,) | float64 | 114.694043 | 2026.910254 | 155.589411 | None |
| 12 | halstead_max | 107 | 0 | (0,) | float64 | 133.782949 | 2026.910254 | 278.826585 | None |
| 13 | halstead_mean | 109 | 0 | (0,) | float64 | 133.782949 | 2026.910254 | 213.145246 | None |
| 14 | entropy_min | 107 | 0 | (0,) | float64 | 4.944166 | 15.234374 | 6.869619 | None |
| 15 | entropy_max | 110 | 0 | (0,) | float64 | 5.285144 | 15.234374 | 7.115813 | None |
| 16 | entropy_mean | 112 | 0 | (0,) | float64 | 5.285144 | 15.234374 | 6.41018 | None |
| 17 | sum_feat_min | 109 | 0 | (0,) | float64 | 17.898785 | 74.178606 | 20.186674 | None |
| 18 | sum_feat_max | 112 | 0 | (0,) | float64 | 17.898785 | 74.178606 | 32.133341 | None |
| 19 | sum_feat_mean | 113 | 0 | (0,) | float64 | 17.898785 | 74.178606 | 27.433546 | None |
| 20 | mult_feat_min | 108 | 0 | (0,) | float64 | 114.824632 | 739.270143 | 114.824632 | None |
| 21 | mult_feat_max | 109 | 0 | (0,) | float64 | 116.908146 | 786.969127 | 193.995312 | None |
| 22 | mult_feat_mean | 113 | 0 | (0,) | float64 | 116.908146 | 739.270143 | 170.52832 | None |
| 23 | score | 79 | 0 | (0,) | float64 | 0.066667 | 1.0 | 0.904762 | None |
X_test_agnia, y_test_agnia = prepare_dataset(agnia, columns_to_transform, transf_features_to_emb, transf_plot_standard_embeddings, columns_to_train)
Our model:
y_agnia = RF.predict(X_test_agnia)
print('Root Mean Squared Error:', root_mean_squared_error(y_test_agnia, y_agnia))
print('Mean Absolute Error:', mean_absolute_error(y_test_agnia, y_agnia))
print('R2 Score:', r2_score(y_test_agnia, y_agnia))
Root Mean Squared Error: 0.19651283170139683
Mean Absolute Error: 0.1650922338911641
R2 Score: -0.009416976503046515
The existing model:
agnia_features = pd.read_csv("data/Agnia_readability_features.csv")
agnia = pd.merge(agnia, agnia_features, on='file')
y_agnia = agnia['readability_score']
print('Root Mean Squared Error:', root_mean_squared_error(y_test_agnia, y_agnia))
print('Mean Absolute Error:', mean_absolute_error(y_test_agnia, y_agnia))
print('R2 Score:', r2_score(y_test_agnia, y_agnia))
Root Mean Squared Error: 0.6546187962862606
Mean Absolute Error: 0.5876805744012243
R2 Score: -10.20123011425795
Summary¶
Problem 1. How do we obtain a score for a snippet?
Solution 0: Compute the average rating and convert it to a score from 0 to 1.
Solution 1: If a snippet was rated better within its pair, assign it 1, otherwise 0; then take the average of these outcomes.
I use Solution 1 here, since the alternative yields only 17 unique readability values versus 28. A minimal sketch of this conversion follows below.
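A sketch of Solution 1 on a hypothetical annotation table (all column and variable names here are illustrative, not the real data format):
# Hypothetical raw annotations: one row per pairwise comparison,
# 'winner' is the snippet judged more readable within the pair.
pairs = pd.DataFrame({
    'snippet_a': [11, 11, 12, 12],
    'snippet_b': [12, 13, 13, 11],
    'winner':    [11, 13, 12, 12],
})
# A snippet gets 1 for every comparison it won and 0 otherwise;
# its score is the mean over all comparisons it appeared in.
outcomes = pd.concat([
    pairs.assign(snippet_id=pairs['snippet_a'], won=(pairs['winner'] == pairs['snippet_a']).astype(float)),
    pairs.assign(snippet_id=pairs['snippet_b'], won=(pairs['winner'] == pairs['snippet_b']).astype(float)),
])
score_sketch = outcomes.groupby('snippet_id')['won'].mean().rename('score')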
Problem 2. How do we obtain scores for the methods inside a snippet?
Solution 0: Given the snippet scores, declare that each snippet's score applies to every method in it.
Solution 1: Keep the snippet-level scores and build aggregated features (min, max, mean, median) over the methods' features, plus the method count per snippet. Train and test at the snippet level. But then the model is better applied at the class level, not the method level.
Solution 2: Invent a function that infers snippet readability from method readability, a geometric mean or something similar. But relying on gut feeling like that is too unreliable.
Solution 3: Dig into +OOB or a small transformer that would be trained on the 3 external datasets and label the methods inside ours.
I started with Solution 0, but when I looked at the results it turned out that one snippet often contains many small methods, or one large and one small, and interpolating the score of the whole snippet onto such a method is not quite right. So in this notebook I use Solution 1 and try to get everything possible out of the 70 snippets.
Problem 3: The data comes in pairs. First, to avoid data leakage, pairs must not be split between train and test; a sketch of a pair-aware split follows below. Second, there is most likely a small bias in the readers' ratings, since they saw pairs rather than single snippets. Going forward, our dataset may be better suited to rating/suggesting readability improvements for pull requests, which is where it came from.
Problem 4: Other classic models. I spent a little time on XGBRegressor and LGBMRegressor, but they gave no gain in accuracy or in stability on other data, and neither did the NN.
Result: The model behaves decently on the test dataset, and slightly worse on Scalabrino. On Agnia's dataset it gives the worst score, possibly because that dataset is more spread out in feature space, as the plot shows, or because its ratings are shifted (from 0.66 to 1). In any case, all 3 datasets are worth adding to the training of the final model.